Data

Which years are missing a metric

dat %>% 
  ggplot(aes(year, lakeid, fill=is.na(daynum))) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~metric, nrow=6) 

Remaining data issues:

  • Northern lakes don’t have DOC until until 1986; missing first 4 years when all other variables are available. Potential fixes:
    • Exclude DOC; start analyses in 1986; fill DOC with e.g., median value, or model of other variables?
  • Southern lakes missing chlorophyll data: 1996 - 1999 and 2002 - 2004; 2002-04 is bad/uncalibrated values, used DOY but not magnitude; earlier years fill with median
  • Zoop data issues:
    • CB missing daphnia peak 1995(?): fill with median
    • FI missing several years from 1996 - 2006; fill with median

Predict DOC daynum from other variable daynum’s in each northern lake

no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>% 
  select(-sampledate) %>% 
  pivot_wider(names_from = metric, values_from = daynum) %>% 
  filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>%  ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start:  AIC=1760.48
## doc_epiMax ~ 1
## 
##                 Df Sum of Sq    RSS    AIC
## + lakeid         6     75207 392168 1732.0
## + stratoff       1     12973 454402 1756.0
## <none>                       467375 1760.5
## + straton        1      1124 466251 1761.9
## + iceon          1       606 466769 1762.2
## + secchi_max     1       606 466769 1762.2
## + energy         1       436 466939 1762.3
## + stability      1       208 467167 1762.4
## + anoxia_summer  1       179 467196 1762.4
## 
## Step:  AIC=1731.95
## doc_epiMax ~ lakeid
## 
##                 Df Sum of Sq    RSS    AIC
## + stratoff       1      3892 388276 1731.7
## <none>                       392168 1732.0
## + anoxia_summer  1      3132 389036 1732.1
## + energy         1      2445 389723 1732.5
## + straton        1       530 391638 1733.6
## + iceon          1       510 391658 1733.7
## + stability      1       394 391774 1733.7
## + secchi_max     1       328 391840 1733.8
## - lakeid         6     75207 467375 1760.5
## 
## Step:  AIC=1731.65
## doc_epiMax ~ lakeid + stratoff
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         388276 1731.7
## + anoxia_summer    1      3273 385004 1731.7
## + energy           1      3036 385240 1731.8
## - stratoff         1      3892 392168 1732.0
## + straton          1       649 387627 1733.3
## + stability        1       603 387673 1733.3
## + iceon            1       374 387903 1733.4
## + secchi_max       1       247 388029 1733.5
## + stratoff:lakeid  6      6943 381333 1739.5
## - lakeid           6     66125 454402 1756.0
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_max)*lakeid, 
               data=n_lakes_wide, 
               na.action = na.exclude)
summary(doc_model)
## 
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy + 
##     stability + anoxia_summer + secchi_max) * lakeid, data = n_lakes_wide, 
##     na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -150.321  -17.397    2.832   23.126  119.066 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              77.29634  170.93863   0.452   0.6517  
## iceon                     0.25799    0.36315   0.710   0.4784  
## straton                   0.22299    0.36107   0.618   0.5376  
## stratoff                  0.44312    0.31556   1.404   0.1620  
## energy                   -0.20198    0.22429  -0.901   0.3690  
## stability                -0.14405    0.14826  -0.972   0.3326  
## anoxia_summer            -0.04084    0.17612  -0.232   0.8169  
## secchi_max               -0.07579    0.08286  -0.915   0.3616  
## lakeid.L                319.71870  449.77873   0.711   0.4781  
## lakeid.Q                357.93277  440.74822   0.812   0.4178  
## lakeid.C                235.35696  456.57259   0.515   0.6068  
## lakeid^4                129.67020  457.05136   0.284   0.7770  
## lakeid^5               -416.46790  463.26683  -0.899   0.3699  
## lakeid^6                295.39872  445.76920   0.663   0.5084  
## iceon:lakeid.L           -1.09045    0.96020  -1.136   0.2576  
## iceon:lakeid.Q           -0.62871    0.94821  -0.663   0.5082  
## iceon:lakeid.C           -0.47248    0.95510  -0.495   0.6214  
## iceon:lakeid^4            0.19702    0.99387   0.198   0.8431  
## iceon:lakeid^5            0.44611    0.94996   0.470   0.6392  
## iceon:lakeid^6           -0.24304    0.95668  -0.254   0.7997  
## straton:lakeid.L         -0.99644    0.96684  -1.031   0.3041  
## straton:lakeid.Q         -0.06648    0.88073  -0.075   0.9399  
## straton:lakeid.C          0.21534    0.97934   0.220   0.8262  
## straton:lakeid^4          1.02889    1.02504   1.004   0.3168  
## straton:lakeid^5          1.52519    0.99168   1.538   0.1258  
## straton:lakeid^6          0.39826    0.87856   0.453   0.6509  
## stratoff:lakeid.L        -0.22517    0.84243  -0.267   0.7896  
## stratoff:lakeid.Q        -0.27602    0.75215  -0.367   0.7141  
## stratoff:lakeid.C         0.17399    0.84559   0.206   0.8372  
## stratoff:lakeid^4        -0.52386    0.94017  -0.557   0.5781  
## stratoff:lakeid^5         0.87622    0.84874   1.032   0.3033  
## stratoff:lakeid^6        -1.22688    0.76657  -1.600   0.1112  
## energy:lakeid.L           0.35302    0.59698   0.591   0.5550  
## energy:lakeid.Q          -0.31656    0.55245  -0.573   0.5674  
## energy:lakeid.C          -0.19354    0.59750  -0.324   0.7464  
## energy:lakeid^4          -0.21917    0.64995  -0.337   0.7364  
## energy:lakeid^5          -0.43207    0.59803  -0.722   0.4709  
## energy:lakeid^6           0.09195    0.56057   0.164   0.8699  
## stability:lakeid.L       -0.07321    0.34534  -0.212   0.8324  
## stability:lakeid.Q        0.22801    0.37854   0.602   0.5477  
## stability:lakeid.C       -0.11656    0.39617  -0.294   0.7689  
## stability:lakeid^4       -0.45500    0.34578  -1.316   0.1899  
## stability:lakeid^5       -0.14418    0.44118  -0.327   0.7442  
## stability:lakeid^6       -0.04854    0.43532  -0.112   0.9113  
## anoxia_summer:lakeid.L    0.29492    0.47623   0.619   0.5365  
## anoxia_summer:lakeid.Q    0.16496    0.46771   0.353   0.7247  
## anoxia_summer:lakeid.C   -0.07344    0.45345  -0.162   0.8715  
## anoxia_summer:lakeid^4   -0.19684    0.50500  -0.390   0.6972  
## anoxia_summer:lakeid^5   -0.16585    0.42945  -0.386   0.6998  
## anoxia_summer:lakeid^6    0.05669    0.46058   0.123   0.9022  
## secchi_max:lakeid.L       0.50747    0.23469   2.162   0.0319 *
## secchi_max:lakeid.Q      -0.44115    0.23547  -1.873   0.0626 .
## secchi_max:lakeid.C      -0.17325    0.22536  -0.769   0.4430  
## secchi_max:lakeid^4      -0.07064    0.19545  -0.361   0.7182  
## secchi_max:lakeid^5      -0.14772    0.21562  -0.685   0.4942  
## secchi_max:lakeid^6       0.13562    0.20575   0.659   0.5106  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.9 on 180 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.2926, Adjusted R-squared:  0.07644 
## F-statistic: 1.354 on 55 and 180 DF,  p-value: 0.072
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)

cor = n_lakes_wide %>% 
  group_by(lakeid) %>% 
  summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))

n_lakes_wide %>% 
  ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
  geom_point()+
  theme_bw() +
  labs(color="lakeid")+
  geom_abline(slope=1, intercept = 0) + 
  facet_wrap(~lakeid) + 
  geom_text(aes(label=r), data=cor, x=300, y=0) +
  labs(x="obs peak DOC (daynum)",  y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).

# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))

missing_doc$doc_fill = round(predict(doc_model, missing_doc))

doc_fill = missing_doc %>% 
  select(lakeid, year, doc_fill) %>% 
  rename(daynum_fill = doc_fill) %>% 
  mutate(metric = "doc_epiMax") %>% 
  filter(year < 2000)

Create filled metric column and filled

all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid, 
                    levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"), 
                    ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_max","secchi_min", "zoopDensity","zoopDensity_CC",
        "doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin", 
        "anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
                    levels = rev(vars),
                    ordered=T)

dat_filled %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

## Additional trimming/filling

Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric

df_yearsWant = dat_filled %>% 
  filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
           (!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))

fi_icefill = df_yearsWant %>% 
  filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>% 
  mutate(lakeid = "FI") %>% 
  select(-daynum, -filled_flag, -sampledate) %>% 
  rename(icefill = daynum_fill)

df_yearsWant = df_yearsWant %>% 
  full_join(fi_icefill) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill), 
                              icefill, daynum),
         filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill), 
                              TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

Final fill w/ medians

all_lys = df_yearsWant %>% 
  select(lakeid, year) %>% 
  distinct()

metric = unique(df_yearsWant$metric)

all_lys_want = expand_grid(all_lys, metric)

dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>% 
  group_by(lakeid, metric) %>% 
  summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>% 
  left_join(medians) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>% 
  select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%  
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")